# Get metadata for the matches of a token.
# Scenario: ids of the texts in which the token 'oil' occurs.
token_to_get <- "oil"

# Look up the id of the token for the p-attribute 'word'. The token is taken
# from `token_to_get`, so changing that variable changes the whole query.
token_id <- cl_str2id(
  "REUTERS",
  p_attribute = "word",
  str = token_to_get,
  registry = get_tmp_registry()
)

# All corpus positions (cpos) at which the token id occurs.
token_cpos <- cl_id2cpos(
  "REUTERS",
  p_attribute = "word",
  id = token_id,
  registry = get_tmp_registry()
)

# Map every corpus position to the struc of the s-attribute 'id' it belongs to.
strucs <- cl_cpos2struc(
  "REUTERS",
  s_attribute = "id",
  cpos = token_cpos,
  registry = get_tmp_registry()
)

# Several matches may fall into the same struc; keep each struc only once
# before decoding the values of the s-attribute.
strucs_unique <- unique(strucs)
text_ids <- cl_struc2str(
  "REUTERS",
  s_attribute = "id",
  struc = strucs_unique,
  registry = get_tmp_registry()
)
# Retrieve the full text of the first text that contains a match.
# The earliest match is the smallest corpus position in `token_cpos`.

# Left boundary of the 'id' region enclosing the earliest match.
left_cpos <- cl_cpos2lbound(
  "REUTERS",
  s_attribute = "id",
  cpos = min(token_cpos),
  registry = get_tmp_registry()
)

# Right boundary of the same region.
right_cpos <- cl_cpos2rbound(
  "REUTERS",
  s_attribute = "id",
  cpos = min(token_cpos),
  registry = get_tmp_registry()
)

# Decode every corpus position between the two boundaries (inclusive) ...
txt <- cl_cpos2str(
  "REUTERS",
  p_attribute = "word",
  cpos = seq.int(left_cpos, right_cpos),
  registry = get_tmp_registry()
)

# ... and join the tokens with single blanks.
fulltext <- paste(txt, collapse = " ")
# Alternative approach to achieve the same result: go through the struc of
# the first match instead of asking for the region boundaries directly.

# Struc of the s-attribute 'id' that contains the earliest match.
first_struc_match_oil <- cl_cpos2struc(
  "REUTERS",
  s_attribute = "id",
  cpos = min(token_cpos),
  registry = get_tmp_registry()
)

# Corpus positions delimiting that struc; the first and second elements are
# used below as start and end of the region.
cpos_struc <- cl_struc2cpos(
  "REUTERS",
  s_attribute = "id",
  struc = first_struc_match_oil,
  registry = get_tmp_registry()
)

# Decode all tokens of the region and join them with single blanks.
txt <- cl_cpos2str(
  "REUTERS",
  p_attribute = "word",
  cpos = seq.int(cpos_struc[1], cpos_struc[2]),
  registry = get_tmp_registry()
)
fulltext <- paste(txt, collapse = " ")
# Run the code above in your browser using DataLab